rm(list=ls(all=TRUE))
library(tidyverse)
Warning: package ‘tidyverse’ was built under R version 4.1.3Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ----------------------------------------------------------------------------------------------------------------- tidyverse 1.3.2 --
√ ggplot2 3.3.5 √ purrr 0.3.4
√ tibble 3.1.6 √ dplyr 1.0.7
√ tidyr 1.1.4 √ stringr 1.4.0
√ readr 2.1.1 √ forcats 0.5.1
-- Conflicts -------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(magrittr)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
library(jsonlite)
Attaching package: ‘jsonlite’
The following object is masked from ‘package:purrr’:
flatten
library(broom)
Read example json file:
results_dir <- "microbeMASST_results/"
json_example <- read_json(str_c(results_dir, "fastMASST_HILIC_neg__165_microbe.json"), simplifyVector = F)
Define function for iterating over nodes in the MASST output:
iterate_masst <- function(masst_node){
node_attributes <- names(masst_node)
if ("Rank" %in% node_attributes && masst_node$Rank == "phylum") {
tibble(
NAME = masst_node$name,
TYPE = masst_node$type,
NCBI = masst_node$NCBI,
RANK = masst_node$Rank,
GROUP_SIZE = masst_node$group_size,
MATCHED_SIZE = masst_node$matched_size
)
}
else {
if ("type" %in% node_attributes && masst_node$type == "node") {
lapply(masst_node$children, iterate_masst) %>%
bind_rows()
}
else {
tibble(
NAME = character(),
TYPE = character(),
NCBI = character(),
RANK = character(),
GROUP_SIZE = integer(),
MATCHED_SIZE = integer()
)
}
}
}
iterate_masst(json_example)
Set up tibble initialized with all json file names:
masst_results <- tibble(FILE_NAME = dir(results_dir, ".*\\.json"))
masst_results
Parse info from file names:
masst_results <- masst_results %>%
mutate(
SEARCH_TYPE = FILE_NAME %>% str_extract("food|microbe"),
DATASET = FILE_NAME %>% str_extract("(HILIC|RP).+(pos|neg)"),
DATASET = if_else(DATASET == "RP_neg", "RP18_neg", DATASET),
SCAN = FILE_NAME %>% str_extract("__.+_") %>% str_extract("[0-9]+"),
FEATURE_ID = str_c(
case_when(
DATASET == "RP18_pos" ~ "X94",
DATASET == "RP18_neg" ~ "X95",
DATASET == "HILIC_pos" ~ "X96",
DATASET == "HILIC_neg" ~ "X97"
),
SCAN %>% str_pad(max(nchar(SCAN)), "left", "0")
)
)
masst_results
Number of results files per dataset:
masst_results %>%
count(DATASET)
Read json files for microbeMASST:
masst_results <- masst_results %>%
mutate(
PATH = str_c(results_dir, FILE_NAME),
JSON = PATH %>% map(read_json),
STATS_PHYLUM = JSON %>% map(iterate_masst)
)
masst_results$JSON[[1]][setdiff(names(masst_results$JSON[[1]]), c("children", "pie_data"))]
$name
[1] "root"
$duplication
[1] "Y"
$type
[1] "node"
$NCBI
[1] "131567"
$Rank
[1] "cellular organisms"
$group_size
[1] 72560
$matched_size
[1] 3
$occurrence_fraction
[1] 4.134509e-05
Add stats for phylum:
masst_results <- masst_results %>%
mutate(
STATS_ROOT = JSON %>% map(~ tibble(ROOT_GROUP_SIZE = .$group_size, ROOT_MATCHED_SIZE = .$matched_size)),
STATS_PHYLUM = JSON %>% map(iterate_masst)
)
masst_results$STATS_ROOT[[1]]
masst_results$STATS_PHYLUM[[1]]
Select relevant columns and unnest stats:
masst_results <- masst_results %>%
select(FEATURE_ID, DATASET, SEARCH_TYPE, STATS_ROOT, STATS_PHYLUM) %>%
unnest(c(STATS_ROOT, STATS_PHYLUM))
masst_results
Check: Is there any other TYPE than “node”?
masst_results$TYPE %>% unique()
[1] "node"
Check: Is there any other RANK than “phylum”?
masst_results$RANK %>% unique()
[1] "phylum"
Perform Fisher’s exact test for the association between features and phyla:
masst_results <- masst_results %>%
filter(MATCHED_SIZE > 0) %>%
mutate(
FISHER = pmap(
list(
ROOT_GROUP_SIZE,
ROOT_MATCHED_SIZE,
GROUP_SIZE,
MATCHED_SIZE
),
~ fisher.test(
matrix(
c(..1, ..2, ..3, ..4),
nrow = 2
)
)
),
FISHER = FISHER %>% map(tidy)
) %>%
unnest(FISHER)
masst_results
Perform correction for multiple testing and check distribution of p-values:
masst_results <- masst_results %>%
mutate(p.value.fdr = p.value %>% p.adjust(method = "fdr"))
masst_results %>%
ggplot() +
geom_point(aes(p.value, p.value.fdr)) +
geom_abline(slope = 1)
masst_results %>%
ggplot() +
geom_histogram(aes(p.value.fdr, fill = DATASET), bins = 100) +
scale_x_continuous(breaks = 0:5/5)
masst_results %>%
filter(p.value.fdr < 0.1) %>%
ggplot() +
geom_histogram(aes(p.value.fdr, fill = DATASET), bins = 100) +
scale_x_continuous(breaks = 0:5/50)
masst_results %>%
filter(p.value.fdr < 0.01) %>%
ggplot() +
geom_histogram(aes(p.value.fdr, fill = DATASET), bins = 100) +
scale_x_continuous(breaks = 0:5/500)
masst_results %>%
filter(p.value.fdr < 0.001) %>%
ggplot() +
geom_histogram(aes(p.value.fdr, fill = DATASET), bins = 100) +
scale_x_continuous(breaks = 0:5/5000)
Filter for a p-value < 0.01:
masst_results <- masst_results %>%
filter(p.value.fdr < 0.01)
masst_results
Number of significant hits per dataset:
masst_results %>%
count(DATASET)
Number of significant hits per dataset:
masst_results %>%
group_by(DATASET) %>%
summarize(N_FEATURES = n_distinct(FEATURE_ID))
Export significant hits for Cytoscape:
masst_results %>%
mutate(`shared name` = FEATURE_ID %>% str_sub(-5, -1) %>% as.integer()) %>%
group_by(DATASET, `shared name`) %>%
summarize(MASST_PHYLA = NAME %>% sort() %>% str_c(collapse = ", "), .groups = "drop_last") %>%
nest() %>%
mutate(
FILE_NAME = DATASET %>% str_remove("_") %>% str_c(".tsv"),
DUMMY = map2(data, FILE_NAME, ~ write_tsv(.x, .y))
)
Read feature annotations from file:
feature_info <- rbind(
read_tsv("feature_metadata/C18neg_feature_metadata_consolidated_is_microbial.tsv", guess_max = 100000) %>%
mutate(MET_CHEM_NO = paste0("X95", formatC(`#featureID`, width = 5, flag = "0", format = "d"))) %>%
mutate(FAMILY_ID = paste0("X95", formatC(GNPS_componentindex, width = 4, flag = "0", format = "d"))),
read_tsv("feature_metadata/C18pos_feature_metadata_consolidated_is_microbial.tsv", guess_max = 100000) %>%
mutate(MET_CHEM_NO = paste0("X94", formatC(`#featureID`, width = 5, flag = "0", format = "d"))) %>%
mutate(FAMILY_ID = paste0("X94", formatC(GNPS_componentindex, width = 4, flag = "0", format = "d"))),
read_tsv("feature_metadata/HILICneg_feature_metadata_consolidated_is_microbial.tsv", guess_max = 100000) %>%
mutate(MET_CHEM_NO = paste0("X97", formatC(`#featureID`, width = 5, flag = "0", format = "d"))) %>%
mutate(FAMILY_ID = paste0("X97", formatC(GNPS_componentindex, width = 4, flag = "0", format = "d"))),
read_tsv("feature_metadata/HILICpos_feature_metadata_consolidated_is_microbial.tsv", guess_max = 100000) %>%
mutate(MET_CHEM_NO = paste0("X96", formatC(`#featureID`, width = 5, flag = "0", format = "d"))) %>%
mutate(FAMILY_ID = paste0("X96", formatC(GNPS_componentindex, width = 4, flag = "0", format = "d")))
) %>%
mutate(FAMILY_ID = if_else(str_detect(FAMILY_ID, "-001$"), "Singleton", FAMILY_ID))
Rows: 6155 Columns: 256
-- Column specification ----------------------------------------------------------------------------------------------------------------------------------
Delimiter: "\t"
chr (159): GNPS_Best Ion, GNPS_GNPSLinkout_Cluster, GNPS_GNPSLinkout_Network, GNPS_INCHI, GNPS_LibraryID, GNPS_MS2 Verification Comment, GNPS_Smiles, ...
dbl (91): #featureID, GNPS_Annotated Adduct Features ID, GNPS_Correlated Features Group ID, GNPS_G1, GNPS_G2, GNPS_G3, GNPS_G4, GNPS_G5, GNPS_G6, GNP...
lgl (6): GNPS_LIB_Pubmed_ID, GNPS_LIB_INCHI_AUX, GNPS_LIB_tags, GNPS_LIBA_INCHI_AUX, GNPS_LIBA_tags, CSI_ConfidenceScore
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 15131 Columns: 256
-- Column specification ----------------------------------------------------------------------------------------------------------------------------------
Delimiter: "\t"
chr (158): GNPS_GNPSLinkout_Cluster, GNPS_GNPSLinkout_Network, GNPS_INCHI, GNPS_LibraryID, GNPS_Smiles, GNPS_SpectrumID, GNPS_LIB_SpectrumID, GNPS_LIB...
dbl (90): #featureID, GNPS_G1, GNPS_G2, GNPS_G3, GNPS_G4, GNPS_G5, GNPS_G6, GNPS_MQScore, GNPS_RTConsensus, GNPS_RTMean, GNPS_RTStdErr, GNPS_SumPeakI...
lgl (8): GNPS_Annotated Adduct Features ID, GNPS_Best Ion, GNPS_Correlated Features Group ID, GNPS_MS2 Verification Comment, GNPS_neutral M mass, GN...
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 11230 Columns: 256
-- Column specification ----------------------------------------------------------------------------------------------------------------------------------
Delimiter: "\t"
chr (153): GNPS_GNPSLinkout_Cluster, GNPS_GNPSLinkout_Network, GNPS_INCHI, GNPS_LibraryID, GNPS_Smiles, GNPS_SpectrumID, GNPS_LIB_SpectrumID, GNPS_LIB...
dbl (89): #featureID, GNPS_Correlated Features Group ID, GNPS_G1, GNPS_G2, GNPS_G3, GNPS_G4, GNPS_G5, GNPS_G6, GNPS_MQScore, GNPS_RTConsensus, GNPS_R...
lgl (14): GNPS_Annotated Adduct Features ID, GNPS_Best Ion, GNPS_MS2 Verification Comment, GNPS_neutral M mass, GNPS_LIB_INCHI_AUX, GNPS_LIB_tags, GN...
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 28762 Columns: 256
-- Column specification ----------------------------------------------------------------------------------------------------------------------------------
Delimiter: "\t"
chr (161): GNPS_Best Ion, GNPS_GNPSLinkout_Cluster, GNPS_GNPSLinkout_Network, GNPS_INCHI, GNPS_LibraryID, GNPS_MS2 Verification Comment, GNPS_Smiles, ...
dbl (92): #featureID, GNPS_Annotated Adduct Features ID, GNPS_Correlated Features Group ID, GNPS_G1, GNPS_G2, GNPS_G3, GNPS_G4, GNPS_G5, GNPS_G6, GNP...
lgl (3): GNPS_LIB_INCHI_AUX, GNPS_LIBA_INCHI_AUX, CSI_ConfidenceScore
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Add FAMILY_ID to the MASST results:
masst_results <- masst_results %>%
inner_join(
feature_info %>%
select(FEATURE_ID = MET_CHEM_NO, FAMILY_ID)
)
Joining, by = "FEATURE_ID"
masst_results
Number of significant hits per dataset:
masst_results %>%
group_by(DATASET) %>%
summarize(N_FAMILIES = n_distinct(FAMILY_ID), n = n())
Read statistical results from file:
skin_p_cat_dir <- read_tsv("Untargeted.p_cat_dir.tsv")
Rows: 33333 Columns: 15
-- Column specification ----------------------------------------------------------------------------------------------------------------------------------
Delimiter: "\t"
chr (15): MET_CHEM_NO, p_cat_dir|base|sebum, p_cat_dir|base|skicon, p_cat_dir|groups|oily-norm, p_cat_dir|groups|skicon, p_cat_dir|oily|exfol, p_cat_d...
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
skin_p_value <- read_tsv("Untargeted.p_value.tsv")
Rows: 44567 Columns: 15
-- Column specification ----------------------------------------------------------------------------------------------------------------------------------
Delimiter: "\t"
chr (1): MET_CHEM_NO
dbl (14): p_value|base|sebum, p_value|base|skicon, p_value|groups|oily-norm, p_value|groups|skicon, p_value|oily|exfol, p_value|oily|F-B, p_value|oily...
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
skin_p_cat_dir %>% colnames()
[1] "MET_CHEM_NO" "p_cat_dir|base|sebum" "p_cat_dir|base|skicon" "p_cat_dir|groups|oily-norm" "p_cat_dir|groups|skicon"
[6] "p_cat_dir|oily|exfol" "p_cat_dir|oily|F-B" "p_cat_dir|oily|F-B|A" "p_cat_dir|oily|F-B|B" "p_cat_dir|oily|F-B|B-A"
[11] "p_cat_dir|oily|F-B|C" "p_cat_dir|oily|F-B|C-A" "p_cat_dir|oily|F-B|C-B" "p_cat_dir|oily|sebum" "p_cat_dir|oily|skicon"
skin_stats <- skin_p_value %>%
select(MET_CHEM_NO) %>%
left_join(skin_p_cat_dir, by = "MET_CHEM_NO") %>%
mutate(
sebumeter_0.1_any = `p_cat_dir|base|sebum` %>% is.na(.) %>% not(),
sebumeter_0.1_up = `p_cat_dir|base|sebum` %>% is.na(.) %>% not() & `p_cat_dir|base|sebum` %>% str_detect("Up"),
sebumeter_0.1_down = `p_cat_dir|base|sebum` %>% is.na(.) %>% not() & `p_cat_dir|base|sebum` %>% str_detect("Dn")
)
skin_stats %>%
group_by(`p_cat_dir|base|sebum`, sebumeter_0.1_any) %>% summarize(.groups = "drop")
skin_stats %>%
group_by(`p_cat_dir|base|sebum`, sebumeter_0.1_up) %>% summarize(.groups = "drop")
skin_stats %>%
group_by(`p_cat_dir|base|sebum`, sebumeter_0.1_down) %>% summarize(.groups = "drop")
Check whether there are skin stats for all features from the MASST results:
masst_results_stats <- masst_results %>%
inner_join(
skin_stats %>%
select(FEATURE_ID = MET_CHEM_NO, sebumeter_0.1_any, sebumeter_0.1_up, sebumeter_0.1_down),
by = "FEATURE_ID"
)
setdiff(masst_results$FEATURE_ID, skin_stats$MET_CHEM_NO)
character(0)
Calculation of binomial distribution:
The binomial distribution gives the probability that the observed or a higher number of features is correlated with sebumeter score just by chance (false positives).
p_alpha <- 0.1
masst_results_stats %>%
group_by(NAME) %>%
summarize(
N_FEATURES = n(),
N_ANY = sum(sebumeter_0.1_any),
N_POS = sum(sebumeter_0.1_up),
N_NEG = sum(sebumeter_0.1_down)
) %>%
rowwise() %>%
mutate(
PBINOM_ANY = pbinom(N_ANY, N_FEATURES, p_alpha, lower.tail = F) %>% format(digits = 2),
PBINOM_POS = pbinom(N_POS, N_FEATURES, p_alpha, lower.tail = F) %>% format(digits = 2),
PBINOM_NEG = pbinom(N_NEG, N_FEATURES, p_alpha, lower.tail = F) %>% format(digits = 2)
)
Are these enrichments false positives, or are significant correlations with sebumeter score diluted by statistical noise in the phyla with many associated features?
Read statistical results from file:
skin_p_cat_dir <- read_tsv("Aggregated_p_cat_dir.tsv")
Rows: 971 Columns: 16
-- Column specification ----------------------------------------------------------------------------------------------------------------------------------
Delimiter: "\t"
chr (15): FAMILY_ID, p_cat_dir|base|sebum, p_cat_dir|base|skicon, p_cat_dir|groups|oily-norm, p_cat_dir|groups|skicon, p_cat_dir|oily|exfol, p_cat_dir...
dbl (1): row_id
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
skin_p_value <- read_tsv("Aggregated_p_value.tsv")
Rows: 1225 Columns: 16
-- Column specification ----------------------------------------------------------------------------------------------------------------------------------
Delimiter: "\t"
chr (1): FAMILY_ID
dbl (15): row_id, p_value|base|sebum, p_value|base|skicon, p_value|groups|oily-norm, p_value|groups|skicon, p_value|oily|exfol, p_value|oily|F-B, p_va...
i Use `spec()` to retrieve the full column specification for this data.
i Specify the column types or set `show_col_types = FALSE` to quiet this message.
skin_p_cat_dir %>% colnames()
[1] "FAMILY_ID" "row_id" "p_cat_dir|base|sebum" "p_cat_dir|base|skicon" "p_cat_dir|groups|oily-norm"
[6] "p_cat_dir|groups|skicon" "p_cat_dir|oily|exfol" "p_cat_dir|oily|F-B" "p_cat_dir|oily|F-B|A" "p_cat_dir|oily|F-B|B"
[11] "p_cat_dir|oily|F-B|B-A" "p_cat_dir|oily|F-B|C" "p_cat_dir|oily|F-B|C-A" "p_cat_dir|oily|F-B|C-B" "p_cat_dir|oily|sebum"
[16] "p_cat_dir|oily|skicon"
skin_stats <- skin_p_value %>%
select(FAMILY_ID) %>%
left_join(skin_p_cat_dir, by = "FAMILY_ID") %>%
mutate(
sebumeter_0.1_any = `p_cat_dir|base|sebum` %>% is.na(.) %>% not(),
sebumeter_0.1_up = `p_cat_dir|base|sebum` %>% is.na(.) %>% not() & `p_cat_dir|base|sebum` %>% str_detect("Up"),
sebumeter_0.1_down = `p_cat_dir|base|sebum` %>% is.na(.) %>% not() & `p_cat_dir|base|sebum` %>% str_detect("Dn")
)
skin_stats %>%
group_by(`p_cat_dir|base|sebum`, sebumeter_0.1_any) %>% summarize(.groups = "drop")
skin_stats %>%
group_by(`p_cat_dir|base|sebum`, sebumeter_0.1_up) %>% summarize(.groups = "drop")
skin_stats %>%
group_by(`p_cat_dir|base|sebum`, sebumeter_0.1_down) %>% summarize(.groups = "drop")
Aggregate by phylum and family:
masst_results_family <- masst_results %>%
group_by(NAME, FAMILY_ID) %>%
summarize(N_FEATURES = n(), .groups = "drop")
masst_results_family
Check whether there are skin stats for all families from the MASST results:
masst_results_stats <- masst_results_family %>%
inner_join(
skin_stats %>%
select(FAMILY_ID, sebumeter_0.1_any, sebumeter_0.1_up, sebumeter_0.1_down),
by = "FAMILY_ID"
)
setdiff(masst_results$FAMILY_ID, skin_stats$FAMILY_ID)
[1] "X970885" "X961864" "X950468" "X940057"
Why are these families missing?
feature_info %>%
filter(FAMILY_ID %in% c("X970885", "X961864", "X950468", "X940057")) %>%
count(FAMILY_ID)
Calculation of binomial distribution:
The binomial distribution gives the probability that the observed or a higher number of families is correlated with sebumeter score just by chance (false positives).
p_alpha <- 0.1
masst_results_stats %>%
group_by(NAME) %>%
summarize(
N_FEATURES = sum(N_FEATURES),
N_FAMILIES = n(),
N_ANY = sum(sebumeter_0.1_any),
N_POS = sum(sebumeter_0.1_up),
N_NEG = sum(sebumeter_0.1_down)
) %>%
rowwise() %>%
mutate(
PBINOM_CHANGED = pbinom(N_ANY, N_FAMILIES, p_alpha, lower.tail = F) %>% format(digits = 2),
PBINOM_UP = pbinom(N_POS, N_FAMILIES, p_alpha, lower.tail = F) %>% format(digits = 2),
PBINOM_DOWN = pbinom(N_NEG, N_FAMILIES, p_alpha, lower.tail = F) %>% format(digits = 2)
)
These are different phyla than found in the enrichment analysis on features, which is another indication that the enrichment may not be meaningful. Individual associations may of course still be meaningful.
sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19044)
Matrix products: default
locale:
[1] LC_COLLATE=English_United States.1252 LC_CTYPE=English_United States.1252 LC_MONETARY=English_United States.1252
[4] LC_NUMERIC=C LC_TIME=English_United States.1252
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] broom_0.7.11 jsonlite_1.7.2 magrittr_2.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_2.1.1 tidyr_1.1.4
[10] tibble_3.1.6 ggplot2_3.3.5 tidyverse_1.3.2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.8 lubridate_1.8.0 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 R6_2.5.1 cellranger_1.1.0
[8] backports_1.4.1 reprex_2.0.1 evaluate_0.19 httr_1.4.2 pillar_1.6.4 rlang_0.4.12 googlesheets4_1.0.0
[15] readxl_1.3.1 rstudioapi_0.13 jquerylib_0.1.4 rmarkdown_2.11 labeling_0.4.2 googledrive_2.0.0 bit_4.0.4
[22] munsell_0.5.0 tinytex_0.36 compiler_4.1.2 modelr_0.1.8 xfun_0.35 pkgconfig_2.0.3 htmltools_0.5.2
[29] tidyselect_1.1.1 fansi_0.5.0 crayon_1.4.2 tzdb_0.2.0 dbplyr_2.1.1 withr_2.4.3 grid_4.1.2
[36] gtable_0.3.0 lifecycle_1.0.1 DBI_1.1.2 scales_1.1.1 cli_3.1.0 stringi_1.7.6 vroom_1.5.7
[43] farver_2.1.0 fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2 generics_0.1.1 vctrs_0.3.8 tools_4.1.2
[50] bit64_4.0.5 glue_1.6.0 hms_1.1.1 parallel_4.1.2 fastmap_1.1.0 yaml_2.2.1 colorspace_2.0-2
[57] gargle_1.2.0 rvest_1.0.2 knitr_1.41 haven_2.4.3